#### MASTER FILE
## From Unemployment to Self-employment: An Evaluation of Self-employment Assistance Programs
## by Alexandre Gaillard & Sumudu Kankanamge 
## June 2023.



## This file produce all the results of the paper, including the online Appendix

## Library
library(plyr)
library(data.table)
library(reldist)
library(fixest)
library(tidyverse)
library(cdlTools)
library(stargazer)
library(lubridate)
library(ggpubr)
library(ggplot2)
library(foreign)
library(nnet)
library(stargazer)
library(gtsummary)
library(mlogit)
library(VGAM)
library(haven)
library(questionr)
library(AER)
library(broom)
library(survey)

## Sample selection
years_selected_transition  = c(1994:2015)

#### Table Bimonthly Gross Flow Rates (CPS)  #### 
load(file=paste0("/Users/alexandregaillard/Dropbox (Personal)/JOLE_revision/final_files_JOLE/Data/CPS/CPS_data_couples_quarter_v2023_ELMM.RData"))

colnames(final_data$PAIDEMP1)
table(final_data$year)

b = final_data[,c("month_id","id_month","couple","unique_id","YEAR","AGE","WTFINL","jobstatus","jobstatus4")]

b$super_id2 = ifelse(b$month_id == "3", (as.numeric(b$id_month)-2)*10000 + as.numeric(b$month_id) + as.numeric(b$couple)*1000000, (as.numeric(b$id_month))*10000 + as.numeric(b$month_id) + as.numeric(b$couple)*1000000)
b$super_id2 = ifelse(b$month_id == "2", (as.numeric(b$id_month)-1)*10000 + as.numeric(b$month_id) + as.numeric(b$couple)*1000000, (as.numeric(b$id_month))*10000 + as.numeric(b$month_id) + as.numeric(b$couple)*1000000)
b           = b[order(b$super_id2),]
b           = b[order(b$unique_id),]
table(b$month_id)

b = b[which(b$YEAR %in% years_selected_transition),]
id_month_all = names(table(b$id_month))
## remove the first two month of month_id == 2
## remove the last two month of month_id == 1
b = b[which(!(b$id_month %in% id_month_all[c(1,2,3)] & b$month_id == 2)),]
b = b[which(!(b$id_month %in% id_month_all[c((length(id_month_all)-2),(length(id_month_all)-1),length(id_month_all))] & b$month_id == 1)),]
table(b$YEAR)

# remove_ID = b$unique_id[which(!(b$AGE %in% select_age_reg) & b$month_id == 1)]
# b = b[which(!(b$unique_id %in% remove_ID)),]
# remove_ID = b$unique_id[which(!(b$AGE %in% select_age_reg) & b$month_id == 2)]
# b = b[which(!(b$unique_id %in% remove_ID)),]
# table(b$AGE)
# head(b)
# table(b$month_id)

## check wrong match...
# Iterate over unique_id values and check the consistency of AGE
consistent_unique_ids <- b %>%
  group_by(unique_id) %>%
  summarize(age_range = max(AGE) - min(AGE)) %>%
  filter(age_range <= 1) %>%
  pull(unique_id)

# Get the inconsistent unique_id values
inconsistent_unique_ids <- setdiff(unique(b$unique_id), consistent_unique_ids)
b = b[which(!(b$unique_id %in% inconsistent_unique_ids)),]
table(b$month_id)

# Compute the transition between occupations 
lfsta_emp         = b$jobstatus4   ## variable for transition flows.
lfsta_emp_select  = b$jobstatus   ## variable for transition flows.
weight            = b$WTFINL        ## weights.
lfsta_emp1        = lfsta_emp[2:(length(lfsta_emp) - 1)]   
lfsta_emp         = lfsta_emp[1:(length(lfsta_emp) - 2)]
lfsta_emp_select1 =  lfsta_emp_select[2:(length(lfsta_emp_select) - 1)]
lfsta_emp_select  =  lfsta_emp_select[1:(length(lfsta_emp_select) - 2)]
weight1           = weight[2:(length(weight) - 1)]   
weight            = weight[1:(length(weight) - 2)]
corresp_id        = b$unique_id[1:(nrow(b) - 2)]

# Remark: if selected more periods (like 3), then do: by = 3, etc.
corresp_id        = corresp_id[seq(1, length(corresp_id), by=2)]
lfsta_emp         = lfsta_emp[seq(1, length(lfsta_emp)  , by=2)]
lfsta_emp1        = lfsta_emp1[seq(1, length(lfsta_emp1), by=2)] 
lfsta_emp_select  = lfsta_emp_select[seq(1, length(lfsta_emp_select)  , by=2)]
lfsta_emp_select1 = lfsta_emp_select1[seq(1, length(lfsta_emp_select1), by=2)] 
weight            = weight[seq(1, length(weight)        , by=2)]
weight1           = weight1[seq(1, length(weight1)      , by=2)]

data_flow         = as.data.frame(cbind(corresp_id, lfsta_emp, lfsta_emp1, as.numeric(weight), as.numeric(weight1), lfsta_emp_select, lfsta_emp_select1))

sum(table(data_flow$lfsta_emp,data_flow$lfsta_emp_select))
sum(table(data_flow$lfsta_emp))
sum(table(data_flow$lfsta_emp_select))

trans      = as.data.frame.matrix(xtabs(as.numeric(data_flow$V4) ~  data_flow$lfsta_emp_select + data_flow$lfsta_emp_select1))
trans
rowsums    = rowSums(trans)
trans2 = trans
for(i in 1:length(rowsums)) trans2[i, ] = trans2[i, ]/rowsums[i]
trans2
TRANS1     = trans[-1,-1]
rowsums    = rowSums(TRANS1)
for(i in 1:length(rowsums)) TRANS1[i, ] = TRANS1[i, ]/rowsums[i]
TRANS1
# TRANS1     = trans[-2,-2]
# rowsums    = rowSums(TRANS1)
# for(i in 1:length(rowsums)) TRANS1[i, ] = TRANS1[i, ]/rowsums[i]
# TRANS1
# round(TRANS1,3)

# ## union members in the US
# test = read.dta13("/Users/alexandregaillard/Documents/Thesis/Entrepreneurship/ENTREPRENEURSHIP_WORKER_UNEMPLOYED/REVISION_RED/CODE_DATA/US_state_controls/state_union_membership.dta")
# test = test[which(test$sector == "Total"),]
# test_agg = aggregate(pctmem ~ state,data=test,FUN=mean)

#### Transition rate W to U by wage group: CPS ####
load(paste0(url,"CPS/ww_2_tmp_v2023_",frequency,"_correction=",full_correction,".RData"))

## select proper samples.
ww_2_eq_model = ww_2[which(ww_2$WtoNILF == 0 & ww_2$year %in% years_selected_transition),] 

# quantile (using mean wage) #
quant_w <- wtd.quantile(ww_2_eq_model$wage, q = seq(0.1,0.9,by=0.1), weight = ww_2_eq_model$WTFINL, na.rm = TRUE)
mean_w <- wtd.mean(ww_2_eq_model$wage, weights= ww_2_eq_model$WTFINL, na.rm = TRUE)

ww_2_eq_model$quartilewage <- cut(ww_2_eq_model$wage, breaks = c(-Inf, quant_w, Inf),  right = FALSE)
ww_2_eq_model_1 = ww_2_eq_model[which(!is.na(ww_2_eq_model$quartilewage)),]
ww_2_WtoUI    = ww_2_eq_model_1[which(ww_2_eq_model_1$WtoUi == 1),]
ww_2_WtoUl    = ww_2_eq_model_1[which(ww_2_eq_model_1$WtoUl == 1),]
ww_2_WtoU     = ww_2_eq_model_1[which(ww_2_eq_model_1$WtoU == 1),]

## among those who do not necessarily report.
ww_WtoUI    = ww_2_eq_model[which(ww_2_eq_model$WtoUi == 1),]
ww_WtoUl    = ww_2_eq_model[which(ww_2_eq_model$WtoUl == 1),]
ww_WtoU     = ww_2_eq_model[which(ww_2_eq_model$WtoU == 1),]
sum(ww_WtoUI$WTFINL)/sum(ww_WtoU$WTFINL)
sum(ww_WtoU$WTFINL)/sum(ww_2_eq_model$WTFINL)

## across years:
xtabs(WTFINL ~ WtoUi + year,data = ww_2_eq_model)[2,]/xtabs(WTFINL ~ WtoU + year,data = ww_2_eq_model)[2,]

## transition rate:
avg_WtoUI = sum(ww_2_WtoUI$WTFINL)/sum(ww_2_eq_model_1$WTFINL)   ## 50% missing values...
avg_WtoUl = sum(ww_2_WtoUl$WTFINL)/sum(ww_2_eq_model_1$WTFINL)   ## 50% missing values...
avg_WtoU  = sum(ww_2_WtoU$WTFINL)/sum(ww_2_eq_model_1$WTFINL)    ## 50% missing values...

tab1 = xtabs(as.numeric(ww_2_WtoU$WTFINL) ~ ww_2_WtoU$quartilewage)
tab2 = xtabs(as.numeric(ww_2_eq_model_1$WTFINL) ~ ww_2_eq_model_1$quartilewage) 
WWtoUwage = (tab1/tab2)/avg_WtoU
plot(WWtoUwage, type="l", ylab="W to U", xlab="quantile", ylim=range(WWtoUwage))
abline(h=1.0)

## regression
mean_val = tapply(ww_2_eq_model$wage, findInterval(ww_2_eq_model$wage, quant_w), mean)
separation_val = (sum(ww_WtoU$WTFINL)/sum(ww_2_eq_model$WTFINL))*WWtoUwage
plot(separation_val, type="l", ylab="W to U", xlab="quantile")
abline(h=1.0)
lm(separation_val ~ I(mean_val/mean_w) + I((mean_val/mean_w)^2))
model_exit = 0.018
data_exit  = (sum(ww_WtoU$WTFINL)/sum(ww_2_eq_model$WTFINL))
data_separation = as.data.frame(cbind(mean_val/mean_w,separation_val))*(model_exit/data_exit)
ggplot(data=data_separation,aes(x=V1,y=separation_val)) + geom_line() 

## in the model, h value corresponds to:
## 2.54/1.23 = 2.06 --- eta = 0.011  == (0.010880027 - 0.009266767)/(2.8013024-1.7706720)*(2.8013024 - 2.06) + 0.009266767 
## 1/1.23 = 0.78 --- eta = 0.016  == (0.018771918 - 0.014978862)/(0.8113539-0.6774539)*(0.8113539 - 0.78) + 0.014978862 
## 0.395/1.23 = 0.32   --- eta = 0.034  == (0.039305959 - 0.027442434)/(0.4298174-0.2420307)*(0.4298174 - 0.32) + 0.027442434 

tab3 = xtabs(as.numeric(ww_2_WtoUl$WTFINL) ~ ww_2_WtoUl$quartilewage)
tab4 = xtabs(as.numeric(ww_2_WtoUI$WTFINL) ~ ww_2_WtoUI$quartilewage)

tab4/tab1
tab3/tab1

## q probability 
plot(tab4/tab1, type="l", ylab="q probability", xlab="quantile", ylim=c(0,1))
plot(tab3/tab1, type="l", ylab="(1-q)", xlab="quantile", ylim=c(0,1))
mean(tab4/tab1) ## close to our target -- probably


# quantile (using mean wage) #
quant_w <- wtd.quantile(ww_2_eq_model$wage, q = c(0.33,0.66), weight = ww_2_eq_model$WTFINL, na.rm = TRUE)
ww_2_eq_model$quartilewage <- cut(ww_2_eq_model$wage, breaks = c(-Inf, quant_w, Inf), labels = c("33%", "66%", "99%"), right = FALSE)
ww_2_eq_model = ww_2_eq_model[which(!is.na(ww_2_eq_model$quartilewage)),]
ww_2_WtoE     = ww_2_eq_model[which(ww_2_eq_model$WtoSE == 1),]

## transition rate:
avg_WtoE = sum(ww_2_WtoE$WTFINL)/sum(ww_2_eq_model$WTFINL)   ## 50% missing values...

tab1 = xtabs(as.numeric(ww_2_WtoE$WTFINL) ~ ww_2_WtoE$quartilewage)
tab2 = xtabs(as.numeric(ww_2_eq_model$WTFINL) ~ ww_2_eq_model$quartilewage) 
WWtoEwage = (tab1/tab2)/avg_WtoE
names(WWtoEwage) = c("[0:0.33]", "[0.33:0.66]", "[0.66:1]")
plot(WWtoEwage, type="l", ylab="W to E", xlab="Tercile", ylim=range(WWtoEwage))
abline(h=1.0)

## Unemployment rate by theta level -- theta is distributed 25 - 75 percentiles
# quantile (using mean wage) #
quant_w <- wtd.quantile(ww_2_eq_model$wage, q = c(0.33,0.66), weight = ww_2_eq_model$WTFINL, na.rm = TRUE)
ww_2_eq_model$quartilewage <- cut(ww_2_eq_model$wage, breaks = c(-Inf, quant_w, Inf), labels = c("33%", "66%", "99%"), right = FALSE)
ww_2_eq_model = ww_2_eq_model[which(!is.na(ww_2_eq_model$quartilewage)),]
ww_2_WtoU     = ww_2_eq_model[which(ww_2_eq_model$WtoU == 1),]

## transition rate:
avg_WtoU = sum(ww_2_WtoU$WTFINL)/sum(ww_2_eq_model$WTFINL)   ## 50% missing values...

tab1 = xtabs(as.numeric(ww_2_WtoU$WTFINL) ~ ww_2_WtoU$quartilewage)
tab2 = xtabs(as.numeric(ww_2_eq_model$WTFINL) ~ ww_2_eq_model$quartilewage) 
WWtoUwage = (tab1/tab2)/avg_WtoU
names(WWtoUwage) = c("[0:0.33]", "[0.33:0.66]", "[0.66:1]")
plot(WWtoUwage, type="l", ylab="W to U", xlab="Tercile", ylim=range(WWtoUwage))
abline(h=1.0)
WWtoUwage

## target a W to U flow of 2.1% per quarter
1.4715722*2.1
0.8674618*2.1
0.6731261*2.1

